import os
import re
from functools import lru_cache

import jieba

from course_planning.settings import BASE_DIR

STOPWORDS_FIST = os.path.join(BASE_DIR, 'course/service/hit_stopwords.txt')


@lru_cache(maxsize=1)
def _load_stopwords():
    """Load the HIT stopword file once and cache the resulting set.

    Each line of the file is one stopword; ``strip()`` removes the trailing
    newline (and any ``\r`` / stray whitespace, which ``replace("\n", "")``
    would have missed). Cached for the process lifetime — edits to the file
    require a restart to take effect.
    """
    with open(STOPWORDS_FIST, 'r', encoding='utf-8') as f:
        return frozenset(line.strip() for line in f)


def pre_procession_text(question):
    """Preprocess a question string into a list of Chinese word tokens.

    Steps:
      1. Keep only runs of Chinese characters (CJK range U+4E00–U+9FA5),
         discarding punctuation, digits and Latin text.
      2. Segment the text with jieba.
      3. Drop stopwords (HIT stopword list) and whitespace-only tokens.

    :param question: raw input text (str)
    :return: list of segmented words with stopwords removed
    """
    # Extract only Chinese-character runs; raw string for the regex, and the
    # pointless re.S flag is dropped (the pattern contains no '.').
    chinese_runs = re.findall(r'[\u4e00-\u9fa5]+', question)
    # Join runs with a space so jieba treats them as separate segments.
    segments = jieba.lcut(" ".join(chinese_runs))
    stop_words = _load_stopwords()
    # word.strip() also filters out the literal " " tokens jieba emits for
    # the joining spaces, which the original version leaked into the result.
    return [word for word in segments if word.strip() and word not in stop_words]
